library(aplore3)
library(ggplot2)
library(plotly)
library(RColorBrewer)
library(pheatmap)
library(cluster)
library(ggcorrplot)
library(dplyr)
library(tidyr)
library(sjPlot)
library(sjmisc)

Data Format: A data.frame with 500 rows and 18 variables such as:

priorfrac - If the patient previously had a fracture
age
weight
height
bmi
premeno
momfrac
armassist
smoke
raterisk
fracscore
fracture
bonemed - Bone medications at enrollment (1: No, 2: Yes)
bonemed_fu - Bone medications at follow-up (1: No, 2: Yes)
bonetreat - Bone medications both at enrollment and follow-up (1: No, 2: Yes)

head(glow_bonemed)
##   sub_id site_id phy_id priorfrac age weight height      bmi premeno momfrac
## 1      1       1     14        No  62   70.3    158 28.16055      No      No
## 2      2       4    284        No  65   87.1    160 34.02344      No      No
## 3      3       6    305       Yes  88   50.8    157 20.60936      No     Yes
## 4      4       6    309        No  82   62.1    160 24.25781      No      No
## 5      5       1     37        No  61   68.0    152 29.43213      No      No
## 6      6       5    299       Yes  67   68.0    161 26.23356      No      No
##   armassist smoke raterisk fracscore fracture bonemed bonemed_fu bonetreat
## 1        No    No     Same         1       No      No         No        No
## 2        No    No     Same         2       No      No         No        No
## 3       Yes    No     Less        11       No      No         No        No
## 4        No    No     Less         5       No      No         No        No
## 5        No    No     Same         1       No      No         No        No
## 6        No   Yes     Same         4       No      No         No        No

Summary statistics for numeric variables

mysummary = glow_bonemed %>%
  select(age, weight, height, bmi, fracscore) %>%
  summarise_each(
    funs(min = min, 
    q25 = quantile(., 0.25), 
    median = median, 
    q75 = quantile(., 0.75), 
    max = max,
    mean = mean, 
    sd = sd,
    variance= var))
# reshape it using tidyr functions
clean.summary = mysummary %>% 
  gather(stat, val) %>%
  separate(stat, into = c("var", "stat"), sep = "_") %>%
  spread(stat, val) %>%
  select(var, min, max, mean, sd, variance)
print(clean.summary)
##         var       min       max      mean        sd   variance
## 1       age  55.00000  90.00000  68.56200  8.989537  80.811780
## 2       bmi  14.87637  49.08241  27.55303  5.973958  35.688178
## 3 fracscore   0.00000  11.00000   3.69800  2.495446   6.227251
## 4    height 134.00000 199.00000 161.36400  6.355493  40.392289
## 5    weight  39.90000 127.00000  71.82320 16.435992 270.141825

Summary statistics for categorical variables

summary(glow_bonemed %>% select(priorfrac, premeno, momfrac, armassist, smoke, raterisk, fracture, bonemed, bonemed_fu, bonetreat))
##  priorfrac premeno   momfrac   armassist smoke        raterisk   fracture 
##  No :374   No :403   No :435   No :312   No :465   Less   :167   No :375  
##  Yes:126   Yes: 97   Yes: 65   Yes:188   Yes: 35   Same   :186   Yes:125  
##                                                    Greater:147            
##  bonemed   bonemed_fu bonetreat
##  No :371   No :361    No :382  
##  Yes:129   Yes:139    Yes:118  
## 

No missing values

colSums(is.na(glow_bonemed))
##     sub_id    site_id     phy_id  priorfrac        age     weight     height 
##          0          0          0          0          0          0          0 
##        bmi    premeno    momfrac  armassist      smoke   raterisk  fracscore 
##          0          0          0          0          0          0          0 
##   fracture    bonemed bonemed_fu  bonetreat 
##          0          0          0          0
sum(is.na(glow_bonemed))
## [1] 0
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.3.3
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
# different way to present no missing values
# kable Extra library to make document more presentable
colSums(is.na(glow_bonemed)) %>%  
  kable("html", caption = "No missing values") %>% 
  kable_styling()
No missing values
x
sub_id 0
site_id 0
phy_id 0
priorfrac 0
age 0
weight 0
height 0
bmi 0
premeno 0
momfrac 0
armassist 0
smoke 0
raterisk 0
fracscore 0
fracture 0
bonemed 0
bonemed_fu 0
bonetreat 0

Age vs Weight: As weight increases the average age decreases
Age vs Height: Weak correlation of as height increases age decreases
Age vs BMI: As bmi increases the average age decreases
Age vs fracscore: As age increases the average fracscore increases

Weight vs Height: As height increases the average weight increases
Weight vs BMI: As bmi increases the average weight increases
Weight vs fracscore: As fracscore increases the average Weight decreases

Height vs BMI: As bmi increases the average height and variance stay the same
Height vs fracscore: As fracscore increases the average height stays the same though variance might decrease

BMI vs fracscore: As fracscore increases the average bmi decreases

plot(glow_bonemed[, c(5:8, 14)])

Non of the following scatter plots show strong groupings for the fracture/no fracture categorical variable

ggplot(glow_bonemed, aes(x = age, y = bmi, color = fracture)) +
  geom_jitter() +
  labs(title = "BMI vs Age")

ggplot(glow_bonemed, aes(x = bmi, y = fracscore, color = fracture)) +
  geom_jitter() +
  labs(title = "Fracture Score vs BMI")

ggplot(glow_bonemed, aes(x = fracscore, y = age, color = fracture)) +
  geom_jitter() +
  labs(title = "Age vs Fracture Score")

ggplot(glow_bonemed, aes(x = weight, y = height, color = fracture)) +
  geom_jitter() +
  labs(title = "Height vs Weight")

Once again there doesn’t seem to be strong groupings of the fracture categorical variable

fracture3dplot = plot_ly(glow_bonemed, 
  x = ~age, 
  y = ~height, 
  z = ~bmi, 
  color = ~fracture, 
  colors = c('#0C4B8E', '#BF382A')) %>% add_markers()
fracture3dplot

There are so little “yes” fracture results that the plot isn’t very useful

## `geom_smooth()` using formula = 'y ~ x'

The boxplot shows that the mean fracscore seems to be slightly higher for smokers compared to non smokers

ggplot(glow_bonemed, aes(x = smoke, y = fracscore)) +
  geom_boxplot() +
  labs(title = "Fracture Score Summary Statistics for Smokers vs Non Smokers")

Plot confirms there is a strong correlation between age/fracscore, bmi/weight

corr_vars <- c("age", "weight", "height", "bmi", "fracscore")
corr_df <- glow_bonemed[, corr_vars]
corr_df <- cor(corr_df)
ggcorrplot(corr = corr_df, lab = TRUE, lab_size = 2,
  colors = c("#6D9EC1", "white", "#E46726")) +
  labs(title = "Correlation Between Variables") +
  theme(plot.title = element_text(hjust = .5),
  plot.subtitle = element_text(hjust = .5))

Clustering EDA

pheatmap(glow_bonemed[, c(5,8)], scale = "column", fontsize_row = 0.1, cluster_cols = F, legend = T, color = colorRampPalette(c("blue", "white", "red"), space = "rgb")(100))

pheatmap(glow_bonemed[, 5:8], scale = "column", fontsize_row = 0.1, cluster_cols = F, legend = T, color = colorRampPalette(c("blue", "white", "red"), space = "rgb")(100))

zScoreScale = scale(glow_bonemed[, 5:8])
zScoreDistance = dist(zScoreScale)
continuousVariableClustering = hclust(zScoreDistance, method = "complete")
plot(continuousVariableClustering)

Modeling

Split the data into a training/testing set

set.seed(4)
trainingIndices = sample(c(1:dim(glow_bonemed)[1]), dim(glow_bonemed)[1]*0.8)
trainingDataframe = glow_bonemed[trainingIndices,]
testingDataframe = glow_bonemed[-trainingIndices,]

Age is the only statistically significant continuous variable at the alpha = 0.2 level (p < 0.0001)

model = glm(fracture ~ age + weight + height + bmi, data = glow_bonemed, family = "binomial")
summary(model)
## 
## Call:
## glm(formula = fracture ~ age + weight + height + bmi, family = "binomial", 
##     data = glow_bonemed)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -13.29208   12.54412  -1.060    0.289    
## age           0.05263    0.01237   4.255 2.09e-05 ***
## weight       -0.09720    0.08559  -1.136    0.256    
## height        0.04914    0.07747   0.634    0.526    
## bmi           0.27450    0.22072   1.244    0.214    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 562.34  on 499  degrees of freedom
## Residual deviance: 532.92  on 495  degrees of freedom
## AIC: 542.92
## 
## Number of Fisher Scoring iterations: 4
AIC(model)
## [1] 542.9224
library(ResourceSelection) 
## Warning: package 'ResourceSelection' was built under R version 4.3.3
## ResourceSelection 0.3-6   2023-06-27
hoslem.test(model$y,fitted(model)) # shows non-significant test result which means this is a decent model fit
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  model$y, fitted(model)
## X-squared = 11.39, df = 8, p-value = 0.1806
# get odds ratio for model
exp((model$coefficients)) 
##  (Intercept)          age       weight       height          bmi 
## 1.687806e-06 1.054042e+00 9.073722e-01 1.050367e+00 1.315867e+00
# get confidence intervals
exp(confint(model))
## Waiting for profiling to be done...
##                    2.5 %       97.5 %
## (Intercept) 3.277769e-17 76023.505396
## age         1.029041e+00     1.080263
## weight      7.644329e-01     1.070088
## height      9.024284e-01     1.222733
## bmi         8.597478e-01     2.047437
# trying to figure out how to use sjPlot to mimic what we did in unit12 prelive
#plot_model(model, type = "pred", terms = c("age", "smoke"))
corr_vars <- c("age", "weight", "height", "bmi", "fracscore")
pc.result<-prcomp(glow_bonemed[, corr_vars],scale.=TRUE)
#Eigen Vectors
pc.result$rotation
##                  PC1         PC2         PC3         PC4          PC5
## age        0.4947219  0.46742140 -0.15246583  0.71654567 -0.009160237
## weight    -0.5273035  0.46578775 -0.08840991  0.03240244 -0.704362523
## height    -0.2345770 -0.08196149 -0.93823245  0.01885633  0.240042129
## bmi       -0.4741030  0.51615173  0.24533677  0.05137380  0.667820563
## fracscore  0.4442985  0.53984137 -0.16872342 -0.69463484  0.013601399
#Eigen Values
eigenvals<-pc.result$sdev^2
eigenvals
## [1] 2.374116591 1.523507236 0.975710317 0.123469514 0.003196342
#Scree plot
par(mfrow = c(1,2))
plot(eigenvals,type = "l",
     main = "Scree Plot",
     ylab = "Eigen Values",
     xlab = "PC #")
plot(eigenvals / sum(eigenvals), 
     type = "l", main = "Scree Plot", 
     ylab = "Prop. Var. Explained", 
     xlab = "PC #", 
     ylim = c(0, 1))
cumulative.prop = cumsum(eigenvals / sum(eigenvals))
lines(cumulative.prop, lty = 2)

# Loess curve for fracscore by bonetreatment group showing fracture or not
glow_bonemed$bonetreat.num <- ifelse(glow_bonemed$bonetreat == "No", 0, 1)
ggplot(glow_bonemed, aes(x = fracscore, y = bonetreat.num, color = fracture)) +
  geom_jitter()+ 
  geom_smooth(method="loess",size=1,span=1)+
  ylim(-.2,1.2) +
  xlab("Fracture Score") +
  ylab("Received bonetreatment at both iterations") +
  ggtitle("Difference in Fracture Score vs bonetreatment at both time points")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 124 rows containing missing values (`geom_point()`).

# shows that there is not an increased risk, i.e. no changes, in likelihood of fracture, whereas only receiving one or no treatments trends to increase the likelihood of a fracture as the fracture score goes up

# plot breaking down to see if there is any separation
ggplot(glow_bonemed, aes(x = fracscore, y = bonetreat, color = fracture)) +
  geom_jitter()

# in bonetreat, i.e. bone meds at both time points, in the no group, there appear to be higher fracture rates with increased fracscore, which would be predicted, i.e. if you received treatment at both times there doesn't appear to be a correlation in fracscore and breaking a bone (fracture), vs the group that did not receive both treatments appears to be a correlation with a higher likelihood correlating to likelihood of fracture
# Loess curve for fracscore by physician group showing fracture or not
table(glow_bonemed$priorfrac) # show table of prior fractures
## 
##  No Yes 
## 374 126
ggplot(glow_bonemed, aes(x = fracscore, y = ifelse(glow_bonemed$fracture == "No", 0, 1), color = priorfrac)) +
  geom_jitter()+ 
  geom_smooth(method="loess",size=1,span=1)+
  ylim(-.2,1.2) +
  xlab("Fracture Score") +
  ylab("Fracture during study") +
  ggtitle("Fracture Score vs Fracture during study within prior fracture groups")
## Warning: Use of `glow_bonemed$fracture` is discouraged.
## ℹ Use `fracture` instead.
## Use of `glow_bonemed$fracture` is discouraged.
## ℹ Use `fracture` instead.
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 129 rows containing missing values (`geom_point()`).

# shows that there is not an increased risk associated with higher fracscore, i.e. no changes, in likelihood of an increased fracture if you previous had a fracture  whereas the group that has never had a fracture tends to increase the likelihood of a fracture as the fracture score goes up

# plot breaking down to see if there is any separation
ggplot(glow_bonemed, aes(x = bonetreat, y = ifelse(glow_bonemed$priorfrac == "No", 0, 1), color = fracture)) +
  geom_jitter() +
  xlab("Both Bone treatments") +
  ylab("Had Prior Fracture") +
  ggtitle("Bone treament vs prior fracture within fracture during study groups")
## Warning: Use of `glow_bonemed$priorfrac` is discouraged.
## ℹ Use `priorfrac` instead.

library(caret)
## Loading required package: lattice
# plot
ggplot(glow_bonemed, aes(x = age, y = ifelse(glow_bonemed$smoke == "No", 0, 1), color = fracture)) +
  geom_jitter()+ 
  geom_smooth(method="loess",size=1,span=1)+
  ylim(-.2,1.2)
## Warning: Use of `glow_bonemed$smoke` is discouraged.
## ℹ Use `smoke` instead.
## Use of `glow_bonemed$smoke` is discouraged.
## ℹ Use `smoke` instead.
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 130 rows containing missing values (`geom_point()`).

library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
set.seed(4)

#note CV and error metric are not really used here, but logLoss is reported for the final model.
# set tuning parameters using logloss
fitControl<-trainControl(method="repeatedcv",number=10,repeats=1,classProbs=TRUE, summaryFunction=mnLogLoss)

# build glmnet model

glmnet.fit<-train(fracture ~ . - sub_id,
                    data=trainingDataframe,
                    method="glmnet",
                    trControl=fitControl,
                    metric="logLoss")
coef(glmnet.fit$finalModel,glmnet.fit$finalModel$lambdaOpt)
## 18 x 1 sparse Matrix of class "dgCMatrix"
##                           s1
## (Intercept)     -0.340276566
## site_id          0.039198564
## phy_id           .          
## priorfracYes     0.350415528
## age              .          
## weight           .          
## height          -0.012466332
## bmi              0.004981546
## premenoYes       .          
## momfracYes       0.370402577
## armassistYes     0.073010553
## smokeYes         .          
## rateriskSame     0.156439065
## rateriskGreater  0.489383198
## fracscore        0.128442276
## bonemedYes       .          
## bonemed_fuYes    0.394532573
## bonetreatYes     .
#Getting predictions for glmnet for Complex model
glmnetfit.predprobs<-predict(glmnet.fit, trainingDataframe ,type="prob")


# glmnet ROC
glmnet.roc<-roc(response= trainingDataframe$fracture, predictor=glmnetfit.predprobs$No,levels=c("No","Yes"))
## Setting direction: controls > cases
plot(glmnet.roc,col="steelblue")

# Save for later
# plot(glmnet.roc,add=T,col="steelblue")
# legend("bottomright",
#      legend=c("Simple", "Complex","GLMNET"),
#      col=c("black", "red","steelblue"),
#      lwd=4, cex =1, xpd = TRUE, horiz = FALSE)


Left out the following variables: bonetreat, bonemed, smoke, premeno, weight, age, phy_id.

# Build complex model with interactions and/or polynomials
complex1 = glm(fracture ~  bmi + bonetreat + fracscore + priorfrac + bonemed + bonemed_fu + priorfrac:fracscore + bmi:fracscore + fracscore:bonetreat, data = trainingDataframe, family = "binomial")
summary(complex1)
## 
## Call:
## glm(formula = fracture ~ bmi + bonetreat + fracscore + priorfrac + 
##     bonemed + bonemed_fu + priorfrac:fracscore + bmi:fracscore + 
##     fracscore:bonetreat, family = "binomial", data = trainingDataframe)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)   
## (Intercept)            -1.755675   1.360927  -1.290  0.19703   
## bmi                    -0.028581   0.047829  -0.598  0.55013   
## bonetreatYes           -1.686170   1.066664  -1.581  0.11393   
## fracscore              -0.009717   0.273117  -0.036  0.97162   
## priorfracYes            1.380602   0.704983   1.958  0.05019 . 
## bonemedYes              1.266284   0.718496   1.762  0.07800 . 
## bonemed_fuYes           1.534272   0.533329   2.877  0.00402 **
## fracscore:priorfracYes -0.175793   0.121858  -1.443  0.14913   
## bmi:fracscore           0.010442   0.009815   1.064  0.28737   
## bonetreatYes:fracscore -0.109089   0.110192  -0.990  0.32218   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 458.45  on 399  degrees of freedom
## Residual deviance: 409.18  on 390  degrees of freedom
## AIC: 429.18
## 
## Number of Fisher Scoring iterations: 4
AIC(complex1)
## [1] 429.1816
library(ResourceSelection) 
hoslem.test(complex1$y,fitted(complex1)) # shows non-significant test result which means this is a decent model fit
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  complex1$y, fitted(complex1)
## X-squared = 5.3987, df = 8, p-value = 0.7142
# get odds ratio for model
exp((complex1$coefficients)) 
##            (Intercept)                    bmi           bonetreatYes 
##              0.1727906              0.9718236              0.1852275 
##              fracscore           priorfracYes             bonemedYes 
##              0.9903299              3.9772935              3.5476441 
##          bonemed_fuYes fracscore:priorfracYes          bmi:fracscore 
##              4.6379481              0.8387915              1.0104969 
## bonetreatYes:fracscore 
##              0.8966504
# get confidence intervals
exp(confint(complex1))
## Waiting for profiling to be done...
##                             2.5 %    97.5 %
## (Intercept)            0.01199624  2.539720
## bmi                    0.88195989  1.064655
## bonetreatYes           0.02205353  1.491962
## fracscore              0.57419435  1.685052
## priorfracYes           0.98048590 15.791502
## bonemedYes             0.85958345 15.579380
## bonemed_fuYes          1.65454068 13.772257
## fracscore:priorfracYes 0.65990118  1.066216
## bmi:fracscore          0.99162476  1.030752
## bonetreatYes:fracscore 0.72373545  1.117224
# Get Predictions
#Complex model from previous
complex1.predprobs<-predict(complex1,trainingDataframe ,type="response")

# complex model ROC
complex1.roc<-roc(response=trainingDataframe$fracture,predictor=complex1.predprobs,levels=c("No","Yes"))
## Setting direction: controls < cases
# plot ROC
plot(complex1.roc,print.thres="best",col="red", main = "Best threshold for training data set")

# Now check validation in test set
set.seed(4)
validateComplexPred <- predict(complex1, newdata = testingDataframe, type="response")


# check confusion matrix positive class is no fracture
threshold = 0.284
validateComplexPredictions<-factor(ifelse(validateComplexPred>threshold,"No","Yes"))

#Confusion matrix for objective 2 complex model 1 with interactions
confusionMatrix(data = validateComplexPredictions, reference = testingDataframe$fracture, positive="Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  25  16
##        Yes 54   5
##                                           
##                Accuracy : 0.3             
##                  95% CI : (0.2124, 0.3998)
##     No Information Rate : 0.79            
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.2677         
##                                           
##  Mcnemar's Test P-Value : 9.764e-06       
##                                           
##             Sensitivity : 0.23810         
##             Specificity : 0.31646         
##          Pos Pred Value : 0.08475         
##          Neg Pred Value : 0.60976         
##              Prevalence : 0.21000         
##          Detection Rate : 0.05000         
##    Detection Prevalence : 0.59000         
##       Balanced Accuracy : 0.27728         
##                                           
##        'Positive' Class : Yes             
## 
# complex model ROC
complex1.roc.Valid<-roc(response=testingDataframe$fracture,predictor=validateComplexPred,levels=c("No","Yes"))
## Setting direction: controls < cases
# plot ROC
plot(complex1.roc.Valid,print.thres="best",col="red", main = "Best threshold for validation data set")